import pandas as pd
import numpy as np
import os
import shap
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_curve, \
roc_auc_score, precision_recall_curve, confusion_matrix, r2_score
import pandas_profiling as pp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
Read the data
!dir data
# Directory holding the preprocessed MEPS (Medical Expenditure Panel Survey) file.
data_dir = 'data'
df = pd.read_csv(os.path.join(data_dir, 'MEPS_data_preprocessed.csv'))
df.reset_index(drop=True, inplace=True)  # ensure a clean 0..n-1 row index
df.head()
Change categorical variables to strings (even though they're represented as numbers)
# Categorical columns that are stored as numeric codes; cast them to strings so
# downstream tooling (profiling, one-hot encoding) treats them as categories,
# not as ordered numbers.
cats = ['REGION','MARRY31X','EDRECODE','FTSTU31X','ACTDTY31','HONRDC31',
        'RTHLTH31','MNHLTH31','HIBPDX','CHDDX','ANGIDX','MIDX','OHRTDX','STRKDX',
        'EMPHDX','CHBRON31','CHOLDX','CANCERDX','DIABDX','JTPAIN31','ARTHDX',
        'ARTHTYPE','ASTHDX','ADHDADDX','PREGNT31','WLKLIM31','ACTLIM31','SOCLIM31',
        'COGLIM31','DFHEAR42','DFSEE42','ADSMOK42','PHQ242','EMPST31','POVCAT15','INSCOV15']
df = df.copy()  # defensive copy before in-place column edits
# Cast all categorical columns in one vectorized call (replaces per-column loop).
df[cats] = df[cats].astype(str)
# Show summary report
df.profile_report(title='Summary Report')
Let's have a closer look at the predicted variable (HEALTHEXP)
# Histogram of the target: raw health expenditures.
df['HEALTHEXP'].hist(bins=100)
# Summary stats at every 5th percentile.
df['HEALTHEXP'].describe(percentiles=np.linspace(0,1,21))
health_exp = df['HEALTHEXP'].values
# Percentile plot on a linear scale; the mean line sits far above the median,
# which indicates right skew.
plt.plot(np.linspace(0,100,21), np.percentile(health_exp, np.linspace(0,100,21)),'o');
plt.axhline(y=np.mean(health_exp), color='red', label = 'mean')
plt.xlabel('Percentile');
plt.ylabel('Healthexp');
plt.legend();
# Same percentile plot, log-scaled y-axis, to make the lower percentiles visible.
plt.plot(np.linspace(0,100,21), np.percentile(health_exp, np.linspace(0,100,21)),'o');
plt.axhline(y=np.mean(health_exp), color='red', label = 'mean')
plt.xlabel('Percentile');
plt.ylabel('Healthexp');
plt.yscale('log')
plt.legend();
# Boxplot on the linear scale ...
plt.figure(figsize=(6,6))
df[['HEALTHEXP']].boxplot();
# ... and on a log scale, where the distribution's spread is easier to read.
plt.figure(figsize=(6,6))
plt.boxplot(df['HEALTHEXP']);
plt.title('Health Exp')
plt.yscale('log')
Notes on the predicted variable: HEALTHEXP is strongly right-skewed — most expenditures are small, while a long upper tail of very large values pulls the mean well above the median.
Prepare features
# Drop panel number (not meant to be predictive) and sample weights
df.drop(columns = ['PANEL', 'PERSONWT'], inplace=True)
df.head()
# Separate the target from the feature frame (pop removes it from df).
y = df.pop('HEALTHEXP')
One-hot encoding for all the categorical features
# One-hot encode every categorical column in a single pass. drop_first avoids
# the dummy-variable trap (perfect collinearity) for the linear models below.
# This replaces the original drop/get_dummies/concat loop with the equivalent
# built-in: non-encoded columns keep their order, dummies are appended after.
df = pd.get_dummies(df, columns=cats, drop_first=True)
df.head()
More feature engineering in next HW ...
def train_print_save_models_reg(mods, dfTrain, dfTest, yTrain, yTest):
    """Fit each regressor in `mods`, print train/test R^2, return fitted models.

    Parameters
    ----------
    mods : dict[str, estimator]
        Mapping of model name -> unfitted sklearn-style regressor.
    dfTrain, dfTest : array-like
        Train/test feature matrices.
    yTrain, yTest : array-like
        Train/test targets (health expenditures, non-negative).

    Returns
    -------
    dict[str, estimator]
        The fitted models, keyed by the same names as `mods`.
    """
    trained_models = {}
    for model_name, mod_object in mods.items():
        md = mod_object.fit(dfTrain, yTrain)
        # Expenditures cannot be negative, so clamp predictions at zero.
        # BUGFIX: the original clamped only the test predictions, so the
        # printed train R^2 was computed on differently post-processed output
        # than the test R^2. Clamp both for a consistent comparison.
        y_pred_test = np.maximum(md.predict(dfTest), 0)
        y_pred_train = np.maximum(md.predict(dfTrain), 0)
        print(model_name)
        print('Training R^2:', r2_score(yTrain, y_pred_train))
        print('Test R^2:', r2_score(yTest, y_pred_test))
        trained_models[model_name] = md
    return trained_models
def train_print_save_models_clf(mods, dfTrain, dfTest, yTrain, yTest):
    """Fit each classifier in `mods`, print train/test accuracy and AUROC,
    and return the fitted models keyed by name.

    Parameters
    ----------
    mods : dict[str, estimator]
        Mapping of model name -> unfitted sklearn-style binary classifier.
    dfTrain, dfTest : array-like
        Train/test feature matrices.
    yTrain, yTest : array-like
        Binary labels.
    """
    trained_models = {}
    for model_name, mod_object in mods.items():
        md = mod_object.fit(dfTrain, yTrain)
        y_pred_test = md.predict(dfTest)
        y_pred_train = md.predict(dfTrain)
        # BUGFIX: AUROC should be computed from continuous scores, not hard
        # 0/1 predictions -- hard labels collapse the ROC curve to a single
        # operating point. Prefer predict_proba, then decision_function, and
        # only fall back to the hard labels if neither exists.
        if hasattr(md, 'predict_proba'):
            score_train = md.predict_proba(dfTrain)[:, 1]
            score_test = md.predict_proba(dfTest)[:, 1]
        elif hasattr(md, 'decision_function'):
            score_train = md.decision_function(dfTrain)
            score_test = md.decision_function(dfTest)
        else:
            score_train, score_test = y_pred_train, y_pred_test
        accuracy_train = accuracy_score(yTrain, y_pred_train)
        AUROC_train = roc_auc_score(yTrain, score_train)
        accuracy_test = accuracy_score(yTest, y_pred_test)
        AUROC_test = roc_auc_score(yTest, score_test)
        print(model_name)
        print('Train - accuracy:{}, AUROC:{}'.format(
            accuracy_train, AUROC_train))
        print('Test - accuracy:{}, AUROC:{}'.format(
            accuracy_test, AUROC_test))
        trained_models[model_name] = md
    return trained_models
# Candidate regressors: a shallow gradient-boosted tree plus two linear baselines.
mods = {
    'XGB':xgb.XGBRegressor(max_depth=2),
    'LinReg':LinearRegression(),
    'Ridge_Reg':Ridge(alpha=.5)
}
# Fixed seed so the split (and everything downstream) is reproducible.
dfTrain, dfTest, yTrain, yTest = train_test_split(df, y, random_state=42)
len(dfTrain), len(dfTest)
models = train_print_save_models_reg(mods, dfTrain, dfTest, yTrain, yTest)
HW 2:
# Pick one held-out observation (fixed seed for reproducibility) and compare
# the tree model's prediction with the actual expenditure.
model1 = models['XGB']
model1
obs = dfTest.sample(random_state = 42)
idx = obs.index
obs
pred = model1.predict(obs)[0]
real = yTest[idx].values[0]
# BUGFIX: typo in the printed message ("observeation" -> "observation").
print('Prediction for selected observation =', pred)
print('While the actual value = ', real)
def show_shap_for_obs(obs, model, model_type, reference=None):
    """Render a SHAP waterfall plot explaining one observation's prediction.

    Parameters
    ----------
    obs : pd.DataFrame
        Single-row feature frame to explain.
    model : fitted estimator
        The model whose prediction is being explained.
    model_type : str
        'Tree' selects shap.TreeExplainer; any other value falls back to
        shap.KernelExplainer on model.predict.
    reference : array-like, optional
        Background data required by KernelExplainer; ignored for 'Tree'.
    """
    if model_type == 'Tree':
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.KernelExplainer(model.predict, reference)
    # FIX: both branches previously duplicated the shap_values/waterfall code
    # (and the else-branch bound the plot to an unused variable). Compute the
    # values once after choosing the explainer.
    shap_values = explainer.shap_values(obs)[0]
    shap.waterfall_plot(explainer.expected_value, shap_values,
                        feature_names=list(obs.columns))
# Explain the previously selected observation with the tree model.
show_shap_for_obs(obs, model1, 'Tree')
# Draw a second held-out observation (different fixed seed) for comparison.
obs2 = dfTest.sample(random_state = 23)
obs2
show_shap_for_obs(obs, model1, 'Tree')
show_shap_for_obs(obs2, model1, 'Tree')
# Compare the two patients on the features that dominated the explanations.
for col in ['AGE31X','ADSMOK42_2']:
    print(col+' Patient 1: {}, Patient 2:{}'.format(obs[col].values[0], obs2[col].values[0]))
In the first observation, ADSMOK42_2 is the most important feature, while in the second it is AGE31X. The patient in the first observation is 40 and a smoker, while the other is 59 (considerably older) and a non-smoker.
Age may be a good shot. Let's give it a try
# Sample a young patient (under 10) to isolate the effect of age.
obs3 = dfTest[dfTest['AGE31X']<10].sample(random_state = 23)
obs3
# Contrast the older patient's explanation with the young patient's.
show_shap_for_obs(obs2, model1, 'Tree')
show_shap_for_obs(obs3, model1, 'Tree')
For the 59 yr old patient age has positive impact (+1509.68). The patient for which the age has negative effect (-489.92) on the prediction is very young (4 years old) thus most likely is not severely ill.
# Explain the same observation under two different models: the tree model and
# the linear model (KernelExplainer with an all-zeros background row).
model2 = models['LinReg']
show_shap_for_obs(obs, model1, 'Tree')
show_shap_for_obs(obs, model2, 'Linear', reference=np.zeros((1,len(obs.columns))))
For the same observation, different models give different effects for the same variables. While for both models ADSMOK42_2 has a positive effect (it increases the predicted treatment costs), PCS42 has a positive effect in one model and a negative effect in the other.
Extras, don't check.
Just out of curiosity — let's do this again with standardized features.
# --- Extras: repeat the linear-model SHAP analysis on standardized features ---
dfTrain, dfTest, yTrain, yTest = train_test_split(df, y, random_state=42)
scaler = StandardScaler()
scaler.fit(dfTrain)
X_train = scaler.transform(dfTrain)
X_test = scaler.transform(dfTest)
models = train_print_save_models_reg(mods, X_train, X_test, yTrain, yTest)
model = models['LinReg']
# BUGFIX: this model was trained on *standardized* features, but the original
# code passed the raw observations to the explainer, so the attributions were
# computed at the wrong feature values. Transform each observation with the
# same fitted scaler first, keeping a DataFrame so column names survive.
def _scaled(o):
    return pd.DataFrame(scaler.transform(o), columns=o.columns, index=o.index)
show_shap_for_obs(_scaled(obs), model, 'Linear', reference=np.zeros((1,len(obs.columns))))
show_shap_for_obs(_scaled(obs2), model, 'Linear', reference=np.zeros((1,len(obs.columns))))
show_shap_for_obs(_scaled(obs3), model, 'Linear', reference=np.zeros((1,len(obs.columns))))
model.intercept_
coefs = model.coef_
# Indices of the ten largest coefficients, in descending order.
max_ten = coefs.argsort()[-10:][::-1]
max_ten_vals = coefs[max_ten]
plt.figure(figsize=(14,3))
plt.bar(range(10), max_ten_vals)
# BUGFIX: obs.columns[[max_ten]] wrapped the index array in an extra list,
# which is deprecated/invalid Index indexing; index with the array directly.
plt.xticks(range(10), list(obs.columns[max_ten]));